Cleaning up environmental salinity data: remove flagged data first, then use a 7-day rolling mean window to remove outliers — data points more than +/- 2 standard deviations from the rolling mean are dropped.
EOS pier/Tiburon
Clear environment, load packages, read in data.
# ---- Setup: clear workspace, attach packages, read raw EOS data ----------
# NOTE(review): rm(list=ls()) only clears the global environment; acceptable
# in a knitted report, but avoid it in reusable scripts.
# NOTE(review): several packages attached here (taRifx, aweek, easypackages,
# renv, cranlogs, recipes, knitr) are not visibly used later in this
# document — candidates for removal; left as-is to avoid changing behavior.
rm(list=ls())
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
library(ggpubr)
library(scales)
library(chron)
library(plotly)
library(taRifx)
library(aweek)
library(easypackages)
## Warning: package 'easypackages' was built under R version 4.0.3
library(renv)
## Warning: package 'renv' was built under R version 4.0.3
library(here)
## Warning: package 'here' was built under R version 4.0.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.0.3
library(tidyquant)
## Warning: package 'tidyquant' was built under R version 4.0.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.0.3
## Warning: package 'quantmod' was built under R version 4.0.3
library(recipes)
library(cranlogs)
## Warning: package 'cranlogs' was built under R version 4.0.3
library(knitr)
# Read the raw EOS pier salinity export (CeNCOOS). fileEncoding strips the
# UTF-8 byte-order mark that spreadsheet-style exports prepend.
# NOTE(review): absolute Windows path — here::here() would be more portable.
eos<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/eos_bouy_r.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
Convert to date column
# Parse the character date column ("mm/dd/YYYY") into Date class.
# (Dropped the pointless c() wrapper around the single format string.)
eos$date <- as.Date(eos$date, format = "%m/%d/%Y")
Subset by year
# Subset to each calendar year.
# FIX: the original used eos[- grep("2017", ...), ], which REMOVES the
# matching year — so "eos17" actually held 2018-2019 data (visible in the
# rendered warnings: the two >35 outliers showed up in 3 of the 4 panels).
# grepl() KEEPS the matching rows and, unlike -grep(), is also safe when
# there are zero matches (-integer(0) would select no rows at all).
eos17 <- eos[grepl("2017", eos$date), ]
eos18 <- eos[grepl("2018", eos$date), ]
eos19 <- eos[grepl("2019", eos$date), ]
Plot raw data by year to view trends
# Raw EOS salinity: one panel for the full record plus one per year,
# stacked with ggarrange().
# FIX: corrected "resesarch" -> "research" in the plot titles.
# The four panels differed only in data and subtitle, so they now share
# one helper to keep styling consistent.
plot_eos_raw_panel <- function(df, subtitle) {
  # Scatter of salinity over time, y-axis fixed to 0-35 (points above 35
  # are dropped by ylim, which is what produced the "Removed 2 rows"
  # warnings in the rendered output).
  ggplot(df, aes(x = date, y = salinity, color = salinity)) +
    geom_point(alpha = 0.5) +
    ylim(0, 35) +
    labs(title = "Salinity data from EOS research pier",
         subtitle = subtitle,
         caption = "data courtesy of CeNCOOS")
}
all <- plot_eos_raw_panel(eos,   "01/01/2017 - 12/31/2019")
e17 <- plot_eos_raw_panel(eos17, "01/01/2017 - 12/31/2017")
e18 <- plot_eos_raw_panel(eos18, "01/01/2018 - 12/31/2018")
e19 <- plot_eos_raw_panel(eos19, "01/01/2019 - 12/31/2019")
ggarrange(all, e17, e18, e19, ncol = 1, nrow = 4)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
Remove flagged data -> no QC flag column is present for this data set. Instead, apply a filter that keeps only salinity values below 35, which removes the two largest outliers.
# Keep only salinity readings below 35; this drops the two extreme
# outliers seen in the raw plots (rows with NA salinity are also dropped
# by filter()).
eos <- eos %>% filter(salinity < 35)
Make function
# Rolling-window summary statistics for outlier screening.
#
# Returns a named numeric vector with the window mean ("mean"), standard
# deviation ("stdev"), and the mean +/- 2 SD bounds ("hi.95" / "lo.95").
# Intended to be passed as FUN to zoo::rollapply() via tidyquant::tq_mutate().
#
# x:     numeric vector (one rolling window of salinity values)
# na.rm: drop NAs before computing the statistics (default TRUE)
#
# FIX: the original accepted na.rm but hard-coded na.rm = TRUE internally,
# silently ignoring the caller's value; it also returned the result
# invisibly via a dangling assignment. Both are corrected here; behavior
# with the default na.rm = TRUE is unchanged.
custom_stat_fun_2 <- function(x, na.rm = TRUE) {
  m <- mean(x, na.rm = na.rm)
  s <- sd(x, na.rm = na.rm)
  c(mean = m, stdev = s, hi.95 = m + 2 * s, lo.95 = m - 2 * s)
}
Roll apply using the custom stat function. The width depends on the frequency of the time series. Data are collected every 6 minutes, so width = 1680 is 7 days, 3360 is 14 days, and 7200 is 30 days (240 observations per day).
Rolling 7 day mean:
# Attach rolling 7-day statistics (mean, sd, +/- 2 SD bounds) to every
# observation: 1680 points = 7 days at one sample per 6 minutes.
eos_week <- tq_mutate(
  eos,
  select     = salinity,
  mutate_fun = rollapply,
  width      = 1680,
  align      = "right",
  by.column  = FALSE,
  FUN        = custom_stat_fun_2,
  na.rm      = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique
Filter and graph. Top=raw, bottom = filtered
# Keep observations inside the rolling +/- 2 SD band, then stack raw (top)
# against filtered (bottom) for comparison.
# FIX: corrected "resesarch" -> "research" in the plot titles.
eos_clean_data <- filter(eos_week, salinity < hi.95 & salinity > lo.95)
eos_graph <- ggplot(eos_clean_data, aes(x = date, y = salinity, color = salinity)) +
  geom_point() +
  labs(title = "Salinity data from EOS research pier REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")
eos_raw <- ggplot(eos, aes(x = date, y = salinity, color = salinity)) +
  geom_point(alpha = 0.5) +
  ylim(0, 35) +
  labs(title = "Salinity data from EOS research pier ALL",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")
ggarrange(eos_raw, eos_graph, ncol = 1, nrow = 2)
Richardson Bay
Clear environment except for what we need “eos_graph” and the function we made “custom_stat_fun_2”. Read in data.
# Reset the workspace, keeping only the finished EOS plot and the rolling
# stat helper needed for the remaining sites.
rm(list=ls()[! ls() %in% c("eos_graph","custom_stat_fun_2")])
# Richardson Bay salinity export (NERR download).
# NOTE(review): absolute Windows path — not portable across machines.
rb<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/richardson_bay_all.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
Convert to date
# Parse DateTimeStamp ("mm/dd/yy") into a Date column.
# (Dropped the pointless c() wrapper around the single format string.)
# NOTE(review): this file uses two-digit %y while the China Camp file uses
# four-digit %Y — confirm the raw files really differ in year format.
rb$date <- as.Date(rb$DateTimeStamp, format = "%m/%d/%y")
Subset by year
# Subset to each calendar year.
# FIX: the original rb[- grep("2017", ...), ] REMOVED the matching year
# instead of keeping it; grepl() keeps matching rows and is safe when
# there are zero matches.
rb17 <- rb[grepl("2017", rb$date), ]
rb18 <- rb[grepl("2018", rb$date), ]
rb19 <- rb[grepl("2019", rb$date), ]
Plot raw data
# Raw Richardson Bay salinity: full record plus one panel per year.
# The four panels share a helper so the styling stays consistent.
plot_rb_raw <- function(df, subtitle) {
  ggplot(df, aes(x = date, y = Sal, color = Sal)) +
    geom_point(alpha = 0.5) +
    labs(title = "Salinity data from Richardson Bay",
         subtitle = subtitle,
         caption = "data courtesy of NERR")
}
rball <- plot_rb_raw(rb,   "01/01/2017 - 12/31/2019")
r17   <- plot_rb_raw(rb17, "01/01/2017 - 12/31/2017")
r18   <- plot_rb_raw(rb18, "01/01/2018 - 12/31/2018")
r19   <- plot_rb_raw(rb19, "01/01/2019 - 12/31/2019")
ggarrange(rball, r17, r18, r19, ncol = 1, nrow = 4)
## Warning: Removed 1496 rows containing missing values (geom_point).
## Warning: Removed 1435 rows containing missing values (geom_point).
## Warning: Removed 1069 rows containing missing values (geom_point).
## Warning: Removed 488 rows containing missing values (geom_point).
Removed flagged data that did not pass QAQC. F_sal with a -3 indicates fail
# Drop rows whose QAQC flag contains "-3" (fail).
# FIX: the original rb[- grep("-3", ...), ] returns ZERO rows whenever no
# flag matches, because -integer(0) is integer(0) and selects nothing;
# !grepl() keeps all rows in that case.
# NOTE(review): "-3" is matched anywhere inside F_Sal — confirm substring
# matching is the intended flag test for this column's format.
rb <- rb[!grepl("-3", rb$F_Sal), ]
Roll apply using the custom stat function. Again, the width depends on the frequency of the time series. Data are collected every 15 minutes, so width = 672 is 7 days, 1344 is 14 days, and 2880 is 30 days (96 observations per day). Since the 7-day window worked for the EOS data, I'm using the same window here so that it's consistent.
7 day window
# Attach rolling 7-day statistics (mean, sd, +/- 2 SD bounds):
# 672 points = 7 days at one sample per 15 minutes.
rb_week <- tq_mutate(
  rb,
  select     = Sal,
  mutate_fun = rollapply,
  width      = 672,
  align      = "right",
  by.column  = FALSE,
  FUN        = custom_stat_fun_2,
  na.rm      = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique
Filter and graph. Top = raw, bottom = filtered
# Keep readings inside the rolling +/- 2 SD band, then stack raw (top)
# against filtered (bottom).
rb_clean_data <- rb_week %>% filter(Sal < hi.95 & Sal > lo.95)
rb_graph <- rb_clean_data %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from Richardson Bay REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")
rb_all <- rb %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from Richardson Bay RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")
ggarrange(rb_all, rb_graph, ncol = 1, nrow = 2)
## Warning: Removed 1496 rows containing missing values (geom_point).
China Camp
Clear environment. We need to keep “eos_graph”, “rb_graph”, and “custom_stat_fun_2”. Read in data.
# Reset the workspace, keeping the finished EOS and Richardson Bay plots
# and the rolling stat helper.
rm(list=ls()[! ls() %in% c("eos_graph","custom_stat_fun_2", "rb_graph")])
# China Camp salinity export (NERR download).
# NOTE(review): absolute Windows path — not portable across machines.
cc<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/china_camp_all.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
Convert to date column
# Parse DateTimeStamp ("mm/dd/YYYY") into a Date column.
# (Dropped the pointless c() wrapper around the single format string.)
cc$date <- as.Date(cc$DateTimeStamp, format = "%m/%d/%Y")
Subset by year
# Subset to each calendar year.
# FIX: the original cc[- grep("2017", ...), ] REMOVED the matching year
# instead of keeping it; grepl() keeps matching rows and is safe when
# there are zero matches.
cc17 <- cc[grepl("2017", cc$date), ]
cc18 <- cc[grepl("2018", cc$date), ]
cc19 <- cc[grepl("2019", cc$date), ]
Plot raw data
# Raw China Camp salinity: full record plus one panel per year.
# The four panels share a helper so the styling stays consistent.
plot_cc_raw <- function(df, subtitle) {
  ggplot(df, aes(x = date, y = Sal, color = Sal)) +
    geom_point(alpha = 0.5) +
    labs(title = "Salinity data from China Camp",
         subtitle = subtitle,
         caption = "data courtesy of NERR")
}
ccall <- plot_cc_raw(cc,   "01/01/2017 - 12/31/2019")
c17   <- plot_cc_raw(cc17, "01/01/2017 - 12/31/2017")
c18   <- plot_cc_raw(cc18, "01/01/2018 - 12/31/2018")
c19   <- plot_cc_raw(cc19, "01/01/2019 - 12/31/2019")
ggarrange(ccall, c17, c18, c19, ncol = 1, nrow = 4)
## Warning: Removed 835 rows containing missing values (geom_point).
## Warning: Removed 315 rows containing missing values (geom_point).
## Warning: Removed 523 rows containing missing values (geom_point).
## Warning: Removed 832 rows containing missing values (geom_point).
Remove flagged data
# Drop rows whose QAQC flag contains "-3" (fail).
# FIX: the original cc[- grep("-3", ...), ] returns ZERO rows whenever no
# flag matches (-integer(0) selects nothing); !grepl() is safe.
# NOTE(review): "-3" is matched anywhere inside F_Sal — confirm substring
# matching is the intended flag test for this column's format.
cc <- cc[!grepl("-3", cc$F_Sal), ]
Roll apply using the custom stat function. Again, the width depends on the frequency of the time series. Data are collected every 15 minutes, so width = 672 is 7 days, 1344 is 14 days, and 2880 is 30 days (96 observations per day). Since the 7-day window worked for the EOS data, I'm using the same window here so that it's consistent.
7 day window
# Attach rolling 7-day statistics (mean, sd, +/- 2 SD bounds):
# 672 points = 7 days at one sample per 15 minutes.
cc_week <- tq_mutate(
  cc,
  select     = Sal,
  mutate_fun = rollapply,
  width      = 672,
  align      = "right",
  by.column  = FALSE,
  FUN        = custom_stat_fun_2,
  na.rm      = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique
Filter and graph. Top=raw, bottom = filtered
# Keep readings inside the rolling +/- 2 SD band, then stack raw (top)
# against filtered (bottom).
cc_clean_data <- cc_week %>% filter(Sal < hi.95 & Sal > lo.95)
cc_graph <- cc_clean_data %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from China Camp REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")
cc_raw <- cc %>%
  ggplot(aes(x = date, y = Sal, color = Sal)) +
  geom_point() +
  labs(title = "Salinity data from China Camp RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of NERR")
ggarrange(cc_raw, cc_graph, ncol = 1, nrow = 2)
## Warning: Removed 835 rows containing missing values (geom_point).
Fort Point
Clear environment. We need to keep “eos_graph”, “rb_graph”, “cc_graph” and “custom_stat_fun_2”. Read in data.
# Reset the workspace, keeping the three finished site plots and the
# rolling stat helper.
rm(list=ls()[! ls() %in% c("eos_graph","custom_stat_fun_2", "rb_graph","cc_graph")])
# Fort Point salinity export (CeNCOOS download).
# NOTE(review): absolute Windows path — not portable across machines.
fp<-read.csv("C:/Users/chels/Box Sync/Thesis/Data/Working data/Bouy data/fort_point_r.csv", header = TRUE, sep=",", fileEncoding="UTF-8-BOM", stringsAsFactors = FALSE)
Convert to date column
# Parse the date column (ISO "YYYY-mm-dd") into Date class.
# (Dropped the pointless c() wrapper; "%Y-%m-%d" is also as.Date's default
# format, but it is kept explicit for clarity.)
fp$date <- as.Date(fp$date, format = "%Y-%m-%d")
Subset by year
# Subset to each calendar year.
# FIX: the original fp[- grep("2017", ...), ] REMOVED the matching year
# instead of keeping it; grepl() keeps matching rows and is safe when
# there are zero matches.
fp17 <- fp[grepl("2017", fp$date), ]
fp18 <- fp[grepl("2018", fp$date), ]
fp19 <- fp[grepl("2019", fp$date), ]
Plot raw data
# Raw Fort Point salinity: full record plus one panel per year.
# The four panels share a helper so the styling stays consistent.
plot_fp_raw <- function(df, subtitle) {
  ggplot(df, aes(x = date, y = sea_water_practical_salinity,
                 color = sea_water_practical_salinity)) +
    geom_point(alpha = 0.5) +
    labs(title = "Salinity data from Fort Point",
         subtitle = subtitle,
         caption = "data courtesy of CeNCOOS")
}
fpall <- plot_fp_raw(fp,   "01/01/2017 - 12/31/2019")
f17   <- plot_fp_raw(fp17, "01/01/2017 - 12/31/2017")
f18   <- plot_fp_raw(fp18, "01/01/2018 - 12/31/2018")
f19   <- plot_fp_raw(fp19, "01/01/2019 - 12/31/2019")
ggarrange(fpall, f17, f18, f19, ncol = 1, nrow = 4)
Remove flagged data: sea_water_practical_salinity_qc_agg = 4 indicates FAIL.
# Keep only rows whose aggregate QC flag is not 4 (4 = FAIL).
# NOTE(review): rows with an NA flag are also dropped by filter() — confirm
# that is intended.
fp <- fp %>% filter(sea_water_practical_salinity_qc_agg != 4)
Roll apply using custom stat function. Data collected every 6 minutes so width=1680 is 7 days, 3360 is 14 days, 7200 is 30 days. 240 observations per day.
7 day window
# Attach rolling 7-day statistics (mean, sd, +/- 2 SD bounds):
# 1680 points = 7 days at one sample per 6 minutes.
fpweek <- tq_mutate(
  fp,
  select     = sea_water_practical_salinity,
  mutate_fun = rollapply,
  width      = 1680,
  align      = "right",
  by.column  = FALSE,
  FUN        = custom_stat_fun_2,
  na.rm      = TRUE
)
## Warning in zoo(y, order.by = index(x), ...): some methods for "zoo" objects do
## not work if the index entries in 'order.by' are not unique
Filter and graph. Raw=top, filtered=bottom
# Keep readings inside the rolling +/- 2 SD band, then stack raw (top)
# against filtered (bottom).
fp_clean_data <- fpweek %>%
  filter(sea_water_practical_salinity < hi.95 &
         sea_water_practical_salinity > lo.95)
fp_graph <- fp_clean_data %>%
  ggplot(aes(x = date, y = sea_water_practical_salinity,
             color = sea_water_practical_salinity)) +
  geom_point() +
  labs(title = "Salinity data from Fort Point REMOVED",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")
fp_raw <- fp %>%
  ggplot(aes(x = date, y = sea_water_practical_salinity,
             color = sea_water_practical_salinity)) +
  geom_point() +
  labs(title = "Salinity data from Fort Point RAW",
       subtitle = "01/01/2017 - 12/31/2019",
       caption = "data courtesy of CeNCOOS")
ggarrange(fp_raw, fp_graph, ncol = 1, nrow = 2)
All together. Now that I have all my graphs, I'm going to stack them. This is the salinity data from 01/01/2017 - 12/31/2019. I first removed data points that failed QC tests. I was able to do this for all sites except EOS pier, where that column wasn't available for download; instead I filtered out salinity values of 35 or more, which looks like it worked. Next step is to get hourly averages.
# Final figure: stacked cleaned-salinity panels for all four sites
# (China Camp, EOS pier, Richardson Bay, Fort Point).
ggarrange(cc_graph, eos_graph,rb_graph,fp_graph, ncol= 1, nrow = 4)